{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using TensorFlow backend.\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from collections import Counter\n",
    "import matplotlib.pyplot as plt\n",
    "from keras.models import Sequential\n",
    "from keras.layers import Dense\n",
    "from keras.callbacks import EarlyStopping\n",
    "from keras.wrappers.scikit_learn import KerasClassifier\n",
    "from keras.optimizers import SGD\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import confusion_matrix\n",
    "from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score\n",
    "from sklearn.metrics import roc_auc_score, average_precision_score\n",
    "from scipy.stats import ks_2samp\n",
    "from keras.wrappers.scikit_learn import KerasClassifier\n",
    "from sklearn.svm import SVC\n",
    "from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, VotingClassifier\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Training split: the 'Unnamed: 0' column stored in both CSVs is dropped.\n",
    "X_train = pd.read_csv('input/X_train_res.csv').drop('Unnamed: 0', axis=1)\n",
    "y_train = pd.read_csv('input/y_smote.csv').drop('Unnamed: 0', axis=1)\n",
    "\n",
    "# Validation/test features: read with their header row, then the 'INDEX' column is dropped.\n",
    "X_val = pd.read_csv('input/X_val.csv').drop('INDEX', axis=1)\n",
    "X_test = pd.read_csv('input/X_test.csv').drop('INDEX', axis=1)\n",
    "\n",
    "# Validation/test labels: column names are supplied explicitly, so the first line of each\n",
    "# file is parsed as data rather than as a header.\n",
    "y_val = pd.read_csv('input/y_val.csv', names=['INDEX', 'IND_BOM_1_1']).drop('INDEX', axis=1)\n",
    "y_test = pd.read_csv('input/y_test.csv', names=['INDEX', 'IND_BOM_1_1']).drop('INDEX', axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import StandardScaler\n",
    "\n",
    "# Learn the standardization from the training features only, then apply that same\n",
    "# transform to the training, validation and test features.\n",
    "scaler = StandardScaler().fit(X_train)\n",
    "X_train, X_val, X_test = (scaler.transform(split) for split in (X_train, X_val, X_test))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of feature columns fed to the first Dense layer. shape[1] is used because\n",
    "# X_train is an ndarray after scaling, where len(list(...)) would count rows instead.\n",
    "input_dim = X_train.shape[1] # Input dimension of the dataset\n",
    "\n",
    "'''\n",
    "    * Método para calcular a distribuição acumulada das classes, de 0.0 a 1.0\n",
    "    * Utilizado para o cálculo do KS\n",
    "'''\n",
    "def calc_distr(y_true:pd.DataFrame, y_pred_proba:np.array):\n",
    "    ac_distr_0 = np.zeros(101)\n",
    "    ac_distr_1 = np.zeros(101)\n",
    "    count_classes = y_true['IND_BOM_1_1'].value_counts()\n",
    "    for i in range(1, 101):\n",
    "        lim = i/100.0\n",
    "        ac_classes = y_true[y_pred_proba <= lim]['IND_BOM_1_1'].value_counts()\n",
    "        ac_distr_0[i] += ac_classes.get(0, 0) \n",
    "        ac_distr_1[i] += ac_classes.get(1, 0)\n",
    "    return (ac_distr_0/count_classes[0], ac_distr_1/count_classes[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "def calc_metrics(y_true, y_pred, y_pred_proba):\n",
    "    '''\n",
    "    Compute the metrics that are later printed.\n",
    "\n",
    "    y_true: true labels; passed to the sklearn metric functions and to calc_distr.\n",
    "    y_pred: predicted class labels.\n",
    "    y_pred_proba: predicted scores, or None to skip the score-based metrics.\n",
    "\n",
    "    Returns a dict with the keys 'cm', 'precision', 'accuracy', 'f1' and 'recall';\n",
    "    'ks', 'auroc' and 'aps' are added only when y_pred_proba is not None.\n",
    "    '''\n",
    "    metrics = {\n",
    "        'cm': confusion_matrix(y_true, y_pred),\n",
    "        'precision': precision_score(y_true, y_pred),\n",
    "        'accuracy': accuracy_score(y_true, y_pred),\n",
    "        'f1': f1_score(y_true, y_pred),\n",
    "        'recall': recall_score(y_true, y_pred)\n",
    "    }\n",
    "    if y_pred_proba is not None:\n",
    "        ac_distr0, ac_distr1 = calc_distr(y_true, y_pred_proba)\n",
    "        # The KS statistic is the largest gap between the two cumulative distributions.\n",
    "        # They are already distributions, so they must not be passed to ks_2samp,\n",
    "        # which expects raw samples.\n",
    "        metrics['ks'] = np.max(np.abs(ac_distr0 - ac_distr1))\n",
    "        metrics['auroc'] = roc_auc_score(y_true, y_pred_proba)\n",
    "        metrics['aps'] = average_precision_score(y_true, y_pred_proba)\n",
    "    return metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "'''\n",
    "    * Método para imprimir todas as métricas\n",
    "'''\n",
    "def print_metrics(metrics:dict):\n",
    "    print('Matriz de Confusão:', end='\\n\\n')\n",
    "    print(pd.DataFrame(metrics['cm'], columns=['T', 'F'], index=['T', 'F']), end='\\n\\n')\n",
    "    print('Área Sob Curva ROC: %.5f'%(metrics['auroc']), end = '\\n\\n')\n",
    "    print('KS-Score: %.5f'%(metrics['ks']), end='\\n\\n')\n",
    "    print('Precisão Média de Previsão: %.5f'%(metrics['aps']), end='\\n\\n')\n",
    "    print('Precisão: %.5f'%(metrics['precision']), end='\\n\\n')\n",
    "    print('Acurácia: %.5f'%(metrics['accuracy']), end='\\n\\n')\n",
    "    print('Recall: %.5f'%(metrics['recall']), end='\\n\\n')\n",
    "    print('F1-Score: %.5f'%(metrics['f1']), end='\\n\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train on 307646 samples, validate on 26077 samples\n",
      "Epoch 1/100000\n",
      "307646/307646 [==============================] - 16s 51us/step - loss: 0.6378 - acc: 0.6366 - val_loss: 0.7593 - val_acc: 0.6230\n",
      "Epoch 2/100000\n",
      "307646/307646 [==============================] - 13s 43us/step - loss: 0.6220 - acc: 0.6529 - val_loss: 0.7698 - val_acc: 0.6340\n",
      "Epoch 3/100000\n",
      "307646/307646 [==============================] - 13s 43us/step - loss: 0.6139 - acc: 0.6609 - val_loss: 0.8053 - val_acc: 0.6387\n",
      "Epoch 4/100000\n",
      "307646/307646 [==============================] - 15s 49us/step - loss: 0.6064 - acc: 0.6688 - val_loss: 0.8540 - val_acc: 0.6404\n"
     ]
    }
   ],
   "source": [
    "# MLP with one 20-unit tanh layer and a single sigmoid output unit\n",
    "mlp = Sequential()\n",
    "mlp.add(Dense(20, activation='tanh', input_dim=input_dim)) # First layer (receives the input features)\n",
    "mlp.add(Dense(1, activation='sigmoid')) # Output layer\n",
    "mlp.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])\n",
    "# Train on the X_train / y_train loaded above; EarlyStopping ends training long before the epoch cap.\n",
    "# NOTE(review): y_val[1:] skips the first label row - confirm that this leaves exactly\n",
    "# one label per row of X_val.\n",
    "history = mlp.fit(X_train, y_train, batch_size=64, epochs=100000, \n",
    "        callbacks=[EarlyStopping(patience=3)], validation_data=(X_val, y_val[1:]));\n",
    "# One forward pass: scores first, then class labels by thresholding the sigmoid output at 0.5\n",
    "y_pred_proba = mlp.predict(X_test)\n",
    "y_pred = (y_pred_proba > 0.5).astype('int32')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Matriz de Confusão:\n",
      "\n",
      "       T      F\n",
      "T  15604  28648\n",
      "F  17182  67001\n",
      "\n",
      "Área Sob Curva ROC: 0.62004\n",
      "\n",
      "KS-Score: 0.25743\n",
      "\n",
      "Precisão Média de Previsão: 0.74326\n",
      "\n",
      "Precisão: 0.70049\n",
      "\n",
      "Acurácia: 0.64317\n",
      "\n",
      "Recall: 0.79590\n",
      "\n",
      "F1-Score: 0.74515\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Score the test-split predictions (y_pred, y_pred_proba) against y_test and print every metric\n",
    "metrics = calc_metrics(y_test, y_pred, y_pred_proba)\n",
    "print_metrics(metrics)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train on 307646 samples, validate on 26077 samples\n",
      "Epoch 1/100000\n",
      "307646/307646 [==============================] - 14s 46us/step - loss: 0.6358 - acc: 0.6384 - val_loss: 0.8444 - val_acc: 0.6152\n",
      "Epoch 2/100000\n",
      "307646/307646 [==============================] - 13s 44us/step - loss: 0.6189 - acc: 0.6558 - val_loss: 0.9041 - val_acc: 0.6194\n",
      "Epoch 3/100000\n",
      "307646/307646 [==============================] - 13s 43us/step - loss: 0.6083 - acc: 0.6670 - val_loss: 0.9163 - val_acc: 0.6262\n",
      "Epoch 4/100000\n",
      "307646/307646 [==============================] - 13s 43us/step - loss: 0.5946 - acc: 0.6787 - val_loss: 0.9438 - val_acc: 0.6370\n"
     ]
    }
   ],
   "source": [
    "# MLP with one 40-unit tanh layer and a single sigmoid output unit\n",
    "mlp = Sequential()\n",
    "mlp.add(Dense(40, activation='tanh', input_dim=input_dim)) # First layer (receives the input features)\n",
    "mlp.add(Dense(1, activation='sigmoid')) # Output layer\n",
    "mlp.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])\n",
    "# Train on the X_train / y_train loaded above; EarlyStopping ends training long before the epoch cap.\n",
    "# NOTE(review): y_val[1:] skips the first label row - confirm that this leaves exactly\n",
    "# one label per row of X_val.\n",
    "history = mlp.fit(X_train, y_train, batch_size=64, epochs=100000, \n",
    "        callbacks=[EarlyStopping(patience=3)], validation_data=(X_val, y_val[1:]))\n",
    "# One forward pass: scores first, then class labels by thresholding the sigmoid output at 0.5\n",
    "y_pred_proba = mlp.predict(X_test)\n",
    "y_pred = (y_pred_proba > 0.5).astype('int32')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Matriz de Confusão:\n",
      "\n",
      "       T      F\n",
      "T  15676  28576\n",
      "F  17766  66417\n",
      "\n",
      "Área Sob Curva ROC: 0.61786\n",
      "\n",
      "KS-Score: 0.28713\n",
      "\n",
      "Precisão Média de Previsão: 0.74139\n",
      "\n",
      "Precisão: 0.69918\n",
      "\n",
      "Acurácia: 0.63918\n",
      "\n",
      "Recall: 0.78896\n",
      "\n",
      "F1-Score: 0.74136\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Score the test-split predictions (y_pred, y_pred_proba) against y_test and print every metric\n",
    "metrics = calc_metrics(y_test, y_pred, y_pred_proba)\n",
    "print_metrics(metrics)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train on 307646 samples, validate on 26077 samples\n",
      "Epoch 1/100000\n",
      "307646/307646 [==============================] - 17s 54us/step - loss: 0.1070 - acc: 0.6237 - val_loss: 0.1263 - val_acc: 0.6607\n",
      "Epoch 2/100000\n",
      "307646/307646 [==============================] - 16s 51us/step - loss: 0.1040 - acc: 0.6411 - val_loss: 0.1357 - val_acc: 0.6708\n",
      "Epoch 3/100000\n",
      "307646/307646 [==============================] - 16s 52us/step - loss: 0.1023 - acc: 0.6502 - val_loss: 0.1465 - val_acc: 0.6660\n",
      "Epoch 4/100000\n",
      "307646/307646 [==============================] - 16s 50us/step - loss: 0.1005 - acc: 0.6594 - val_loss: 0.1511 - val_acc: 0.6619\n",
      "Matriz de Confusão:\n",
      "\n",
      "      T      F\n",
      "T  2564  41688\n",
      "F  1590  82593\n",
      "\n",
      "Área Sob Curva ROC: 0.60856\n",
      "\n",
      "KS-Score: 0.32673\n",
      "\n",
      "Precisão Média de Previsão: 0.73663\n",
      "\n",
      "Precisão: 0.66457\n",
      "\n",
      "Acurácia: 0.66304\n",
      "\n",
      "Recall: 0.98111\n",
      "\n",
      "F1-Score: 0.79240\n",
      "\n"
     ]
    }
   ],
   "source": [
    "hidden_nodes = 30 # Units in each of the stacked ReLU layers\n",
    "\n",
    "# Deeper MLP: a 40-unit sigmoid layer, five ReLU layers and a single sigmoid output unit\n",
    "mlp = Sequential()\n",
    "mlp.add(Dense(40, activation='sigmoid', input_dim=input_dim)) # First layer (receives the input features)\n",
    "# Only the first layer needs input_dim; later layers take their input size from the previous layer\n",
    "for i in range(5):\n",
    "    mlp.add(Dense(hidden_nodes, activation='relu'))\n",
    "mlp.add(Dense(1, activation='sigmoid')) # Output layer\n",
    "mlp.compile(optimizer='adam', loss='mean_squared_logarithmic_error', metrics=['acc'])\n",
    "# Train on the X_train / y_train loaded above; EarlyStopping ends training long before the epoch cap.\n",
    "# NOTE(review): y_val[1:] skips the first label row - confirm that this leaves exactly\n",
    "# one label per row of X_val.\n",
    "history = mlp.fit(X_train, y_train, batch_size=64, epochs=100000, \n",
    "        callbacks=[EarlyStopping(patience=3)], validation_data=(X_val, y_val[1:]))\n",
    "# One forward pass: scores first, then class labels by thresholding the sigmoid output at 0.5\n",
    "y_pred_proba = mlp.predict(X_test)\n",
    "y_pred = (y_pred_proba > 0.5).astype('int32')\n",
    "metrics = calc_metrics(y_test, y_pred, y_pred_proba)\n",
    "print_metrics(metrics)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
